In [19]:
import os
import pathlib
import pickle
import earthaccess
import earthpy as et
import earthpy.earthexplorer as etee
import geopandas as gpd
import geoviews as gv
import gitpass
import glob
import holoviews as hv
import hvplot.pandas
import hvplot.xarray
import numpy as np
import pandas as pd
import rasterio
import requests
import rioxarray as rxr
import rioxarray.merge as rxrm
import shapely
import xarray as xr
import xrspatial
from sklearn.cluster import KMeans
data_dir = os.path.join(et.io.HOME, et.io.DATA_NAME)
os.environ["GDAL_HTTP_MAX_RETRY"] = "5"
os.environ["GDAL_HTTP_RETRY_DELAY"] = "1"
In [2]:
# Caching decorator
def cached(key, override=False):
""""""
def compute_and_cache_decorator(compute_function):
""""""
def compute_and_cache(*args, **kwargs):
"""Perform a computation and cache, or load cached result"""
filename = os.path.join(et.io.HOME, et.io.DATA_NAME, 'jars', f'{key}.pickle')
# Check if the cache exists already or override caching
if not os.path.exists(filename) or override:
# Make jars directory if needed
os.makedirs(os.path.dirname(filename), exist_ok=True)
# Run the compute function as the user did
result = compute_function(*args, **kwargs)
# Pickle the object
with open(filename, 'wb') as file:
pickle.dump(result, file)
else:
# Unpickle the object
with open(filename, 'rb') as file:
result = pickle.load(file)
return result
return compute_and_cache
return compute_and_cache_decorator
In [7]:
#Download WBD spatial data
wbd_path = os.path.join(
data_dir,
'earthpy-downloads',
'WBD_18_HU2_Shape',
'Shape',
'WBDHU10.shp'
)
wbd_url = ("https://prd-tnm.s3.amazonaws.com/StagedProducts/"
"Hydrography/WBD/HU2/Shape/WBD_18_HU2_Shape.zip"
)
if not os.path.exists(wbd_path):
print('downloading ' + wbd_url)
wbd_zip = et.data.get_data(url=wbd_url)
else:
print(wbd_path + " already exists")
# Create a GDF and plot the HUC
wbd_gdf = gpd.read_file(wbd_path)
huc_gdf = wbd_gdf[wbd_gdf['huc10'] == '1809010202']
gv.tile_sources.EsriImagery() * gv.Polygons(huc_gdf).opts(
fill_alpha=0, line_color='blue', title='HUC 180901020204',
height=600, width=600
)
C:\Users\Pete\earth-analytics\data\earthpy-downloads\WBD_18_HU2_Shape\Shape\WBDHU10.shp already exists
Out[7]:
Site Description¶
HUC 1809010202 is a HUC 10 watershed located in the Eastern Sierra region of California. This watershed is on the east side of the Sierra Crest, a significant geographic boundary in the western United States. Multiple major land forms and features exist within this watershed including large, high elevation mountains, alpine forests, high desert, alpine lakes, and the town Mammoth Lakes, CA and the ski area Mammoth Mountain.
In [8]:
# Search multispectral data from Earthdata
earthaccess.login(persist=True)
results = earthaccess.search_data(
short_name="HLSL30",
cloud_hosted=True,
bounding_box=tuple(huc_gdf.total_bounds),
temporal=("2023-05-01", "2023-09-30"),
)
Granules found: 18
In [9]:
# Open the Earthaccess results so you only have to do it once
@cached("open_results", False)
def open_results_and_cache(results):
open_results = earthaccess.open(results)
return open_results
open_results = open_results_and_cache(results)
Opening 18 granules, approx size: 3.47 GB
QUEUEING TASKS | : 0%| | 0/270 [00:00<?, ?it/s]
PROCESSING TASKS | : 0%| | 0/270 [00:00<?, ?it/s]
COLLECTING RESULTS | : 0%| | 0/270 [00:00<?, ?it/s]
In [ ]:
url_results = [file_obj.url for file_obj in open_results]
In [ ]:
# Download all the tif files since I can't get rasterio to open them
def download_files(urls, directory):
for url in urls:
# Extract filename from URL
filename = url.split('/')[-1]
# Construct path to save file
filepath = os.path.join(directory, filename)
# Check if file already exists
if os.path.exists(filepath):
print(f"{filename} already exists. Skipping download.")
continue
# Download the file
print(f"Downloading {filename}...")
response = requests.get(url)
# Check if download was successful
if response.status_code == 200:
# Save the file
with open(filepath, 'wb') as file:
file.write(response.content)
print(f"Downloaded {filename} successfully!")
else:
print(f"Failed to download {filename}.")
dl_path = os.path.join(
data_dir,
'earthpy-downloads'
)
download_files(url_results, dl_path)
In [83]:
# Process the granules from Earthdata
def create_granule_df(earthdata_results, open_results):
"""
Process the granules returned by an Earthdata search
Parameters
==========
earthdata_results : list
A list object returned by earthacces.search_data().
This contains the result granules from the Earthdata search.
open_results : opened earthdata_results object
Returns
=======
granule_df : DataFrame
A dataframe with a row for each granule file.
Each row will contain the granule ID, the download URL,
the datetime, the tile ID, and the band.
"""
columns = ['geometry','granule_id', 'file_url', 'tile_id', 'datetime', 'band_number']
granule_df = gpd.GeoDataFrame(columns=columns)
# Loop through each file in the granule results and accumulate attributes
for url in open_results:
file_url = str(url.url)
# Should probably use regular expression here
gran_id = file_url[73:107]
tile_id = file_url[81:87]
datetime = file_url[88:95]
band = file_url[-7:-4]
if band == 'ask':
band = 'Fmask'
# Find granule geometry based on granule ID
for result in earthdata_results:
result_attr = result['umm']
result_gran_id = result_attr['GranuleUR']
if result_gran_id == gran_id:
extent = result_attr['SpatialExtent']
geom_extent=extent['HorizontalSpatialDomain']['Geometry']['GPolygons']
coord_list = []
for item in geom_extent:
poly = item['Boundary']['Points']
for vertex in poly:
longitude = vertex['Longitude']
latitude = vertex['Latitude']
coord_list.append((longitude, latitude))
granule_geometry = shapely.geometry.Polygon(coord_list)
# Append info for each file to dataframe
new_row = pd.DataFrame({'geometry': [granule_geometry],
'granule_id': [gran_id],
'file_url': [file_url],
'tile_id': [tile_id],
'datetime': [datetime],
'band_number': [band]
})
granule_df = pd.concat([granule_df, new_row], ignore_index=True)
return granule_df
In [15]:
@cached("download_df", False)
def do_something(results, open_results):
download_df = create_granule_df(results, open_results)
return download_df
download_df = do_something(results, open_results)
In [84]:
# Function to compute image mask
def process_tifs(download_df, huc_gdf):
"""
Load and process the Fmask and B band .tif files
in a granule.
Parameters
==========
download_df : pandas.core.groupby.generic.DataFrame
Dataframe containing all the rows for each band .tif for all the files.
huc_gdf : GeoDataFrame
Geodataframe containing the geometry of the HUC watershed used to clip
Returns
=======
accumulator_df : Pandas dataframe
Dataframe containing the processed rows for all the tifs.
An xarray has been added for each processed .tif in the last column.
"""
grouped = download_df.groupby('granule_id')
columns = ['geometry',
'granule_id',
'file_url',
'tile_id',
'datetime',
'band_number',
'processed_da']
accumulator_df = pd.DataFrame(columns=columns)
# Looping through each Granule and processing data
# Create cloud mask
for granule_id, group_data in grouped:
print("Processing data for Granule ID:", granule_id)
granule_gdf = group_data
fmask_gdf = group_data['band_number'] == "Fmask"
fmask_gdf = group_data[fmask_gdf].iloc[0:1]
file_name = (
fmask_gdf.iloc[0]['granule_id'] +
"." + fmask_gdf.iloc[0]['band_number'] + '.tif'
)
file_path = data_dir + '\\earthpy-downloads\\' + file_name
fmask_da = rxr.open_rasterio(file_path).squeeze()
huc_gdf_crs = huc_gdf.to_crs(fmask_da.rio.crs)
fmask_crop_da = fmask_da.rio.clip_box(*huc_gdf_crs.total_bounds)
mask = compute_mask(fmask_crop_da)
# Apply crop, scale factor, and mask to all "B" files
for index, row in group_data.iterrows():
if (row['band_number'].startswith('B')
and row['band_number'] not in ('B10', 'B11')):
file_name = row['granule_id'] + "." + row['band_number'] + '.tif'
file_id = row['granule_id'] + "." + row['band_number']
file_path = data_dir + '\\earthpy-downloads\\' + file_name
bband_da = rxr.open_rasterio(file_path).squeeze()
huc_gdf = huc_gdf.to_crs(bband_da.rio.crs)
bband_crop_da = bband_da.rio.clip_box(*huc_gdf_crs.total_bounds)
bband_crop_da = bband_crop_da.where(bband_crop_da >= 0, np.nan)
scale_factor = 0.0001
bband_crop_da = bband_crop_da * scale_factor
processed_da = bband_crop_da.where(mask)
add_row = pd.DataFrame({'geometry': row['geometry'],
'granule_id': row['granule_id'],
'file_url': row['file_url'],
'tile_id': row['tile_id'],
'datetime': row['datetime'],
'band_number': row['band_number'],
'processed_da': [processed_da]})
accumulator_df = pd.concat([accumulator_df, add_row])
return accumulator_df
def compute_mask(da, mask_bits = [1,2,3]):
"""
Compute a mask layer from the Fmask
Parameters
==========
da : rioxarray
An array of the Fmask layer to use to compute the mask
bits : list
A list of bits to exclude, documentation is here on page 21:
https://hls.gsfc.nasa.gov/wp-content/uploads/2019/01/HLS.v1.4.UserGuide_draft_ver3.1.pdf
Returns
=======
mask : rioxarray
An array with computed mask values
"""
# Unpack bits in the Fmask array
# Need to reverse order of bits from most significant to little
bits = np.unpackbits(da.astype(np.uint8), bitorder = 'little').reshape(da.shape + (-1,))
# Select the bits to use for the mask
mask = np.prod(bits[..., mask_bits]==0, axis=-1)
return mask
In [89]:
# Run methods to process all .tifs and create accumulator dataframe
@cached("accumulator_df", False)
def process_tifs_and_cache(download_df):
accumulator_df = process_tifs(download_df, huc_gdf)
return accumulator_df
accumulator_df = process_tifs_and_cache(download_df)
Processing data for Granule ID: HLS.L30.T11SLB.2023128T183322.v2.0 Processing data for Granule ID: HLS.L30.T11SLB.2023136T183258.v2.0 Processing data for Granule ID: HLS.L30.T11SLB.2023144T183311.v2.0 Processing data for Granule ID: HLS.L30.T11SLB.2023152T183301.v2.0 Processing data for Granule ID: HLS.L30.T11SLB.2023160T183307.v2.0 Processing data for Granule ID: HLS.L30.T11SLB.2023168T183311.v2.0 Processing data for Granule ID: HLS.L30.T11SLB.2023176T183301.v2.0 Processing data for Granule ID: HLS.L30.T11SLB.2023184T183321.v2.0 Processing data for Granule ID: HLS.L30.T11SLB.2023192T183318.v2.0 Processing data for Granule ID: HLS.L30.T11SLB.2023200T183324.v2.0 Processing data for Granule ID: HLS.L30.T11SLB.2023208T183320.v2.0 Processing data for Granule ID: HLS.L30.T11SLB.2023216T183334.v2.0 Processing data for Granule ID: HLS.L30.T11SLB.2023224T183329.v2.0 Processing data for Granule ID: HLS.L30.T11SLB.2023240T183338.v2.0 Processing data for Granule ID: HLS.L30.T11SLB.2023248T183340.v2.0 Processing data for Granule ID: HLS.L30.T11SLB.2023256T183347.v2.0 Processing data for Granule ID: HLS.L30.T11SLB.2023264T183347.v2.0 Processing data for Granule ID: HLS.L30.T11SLB.2023272T183344.v2.0
In [90]:
# Group the DataFrame by 'band_number' and 'datetime'
grouped_band_df = accumulator_df.groupby('band_number')
# Iterate through each band
band_accumulator = []
for band_number, band_data in grouped_band_df:
# Iterate over each date
date_accumulator = []
for datetime, date_data in band_data.groupby('datetime'):
# Merge data from all 4 granules
merged_date_array = rxrm.merge_arrays(date_data['processed_da'].values)
# Add datetime dimension
merged_date_array = merged_date_array.assign_coords(date=datetime)
# Mask any negative values
merged_date_array = xr.where(merged_date_array <= 0, np.nan, merged_date_array)
# Append to accumulator list
date_accumulator.append(merged_date_array)
# Concatenate the merged DataArrays along a new 'date' dimension
date_accumulator = xr.concat(date_accumulator, dim='datetime')
# Take the mean along the 'date' dimension to create a composite image
composite_image = date_accumulator.mean(dim='datetime', skipna=True)
# Add the 'band_number' as a new dimension and give the DataArray a name
composite_image = composite_image.assign_coords(band_number=band_number)
#composite_image.name = str(band_number) + "_array"
#print(composite_image.name)
band_accumulator.append(composite_image)
# Concatenate along the 'band_number' dimension
all_bands_array = xr.concat(band_accumulator, 'band_number')
In [103]:
# Organize data array of bands
all_bands_array.name = 'reflectance'
model_df = (
all_bands_array
.sel(band_number=['B01', 'B02', 'B03', 'B04', 'B05', 'B06', 'B07', 'B09'])
.to_dataframe()
.reflectance
.unstack('band_number')
.dropna()
)
# Fit KMeans model to my data
k_means = KMeans(n_clusters=6)
model_df['category'] = k_means.fit_predict(model_df)
C:\Users\Pete\miniconda3\envs\earth-analytics-python\lib\site-packages\sklearn\cluster\_kmeans.py:1416: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning super()._check_params_vs_input(X, default_n_init=10)
In [104]:
# Prepare data to display KMeans categories
model_array = model_df.category.to_xarray()
model_plot = (model_df
.category
.to_xarray()
.sortby(['x','y'])
.hvplot(x='x', y='y', colormap='Colorblind', aspect=1, title='KMeans Categories')
)
# Prepare the data to display as RGB
rgb = all_bands_array.sel(band_number=['B04', 'B03', 'B02'])
rgb_unint8 = (rgb * 40).astype(np.uint8())
rgb_brighten = rgb_unint8 * 10
rgb_plot = (
rgb_brighten
.hvplot
.rgb(y='y', x='x', bands='band_number', aspect=1, colormap='RGB', title='RGB Aerial Image')
)
rgb_plot + model_plot
C:\Users\Pete\miniconda3\envs\earth-analytics-python\lib\site-packages\xarray\core\duck_array_ops.py:203: RuntimeWarning: invalid value encountered in cast return data.astype(dtype, **kwargs)
Out[104]: